################################################################################
##
## Purpose: This script is an intermediate merge to help further clean the data.
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
##  - Inputs:
##    - ./data/prepped/hearings/cleaned_docs.csv: Prepped data from 1_DATA_hearings_prep.R
##    - ./data/prepped/demographics/child_gender_final.csv: Prepped data from 3_DATA_child_gender_prep.R
##    - ./data/prepped/demographics/kids_to_check_12_25_2021_jb_manual.csv: Manually prepared data from an output of this script (./data/prepped/demographics/kids_to_check_12_25_2021.csv)
##    - ./data/prepped/politicians/politician_prepped_new.csv: Prepped data from 2_DATA_politician_prep.R
##    - ./data/raw/politicians/legislators-historical.csv: Raw data from https://github.com/unitedstates/congress-legislators
##    - ./data/raw/politicians/legislators-current.csv: Raw data from https://github.com/unitedstates/congress-legislators
##    - ./data/prepped/demographics/IG_ratings_new.csv: Prepped data from 0_DATA_IG_scraper.R (not run as part of replication)
##    - ./data/raw/donations/cands##.txt: Raw data on campaign contributions from OpenSecrets (https://www.opensecrets.org/bulk-data)
##    - ./data/prepped/demographics/IG_toclean_JBchecked.csv: Manually prepared data from an output of this script (./data/prepped/demographics/IG_toclean.csv)
##    - ./data/prepped/demographics/child_gender_final_merged.csv: Saved intermediate file from this script 
##  - Outputs:
##    - ./data/prepped/demographics/kids_to_check_12_25_2021.csv
##    - ./data/prepped/demographics/IG_toclean.csv
##    - ./data/prepped/finalData_12_25_2021.RData
##    - ./data/prepped/demographics/finalIG.csv
##    - ./data/prepped/demographics/child_gender_final_merged.csv
##    - ./data/prepped/demographics/child_gender_final_merged_cleaned.csv
##
##
## See associated log file for compute environment, package versions, 
##  and date of most recent run.
##
################################################################################
# Start from an empty workspace and release memory before loading anything.
rm(list = ls())
gc()

# library() stops with an error when a package is missing; require() would only
#   return FALSE (with a warning) and let the script fail later at the first
#   stringdist()/dplyr call.
library(stringdist)
library(tidyverse)

# Fix the RNG seed for the run.
set.seed(123)

# Compute details: log the run date plus basic hardware / process information
#   for the machine executing this script, followed by the R session details.
print(paste0('Compute environment from ',Sys.Date(),' run by Bisbee'))
if(Sys.info()['sysname'] == 'Windows') {
  # NOTE(review): the [-1] / [2] indexing presumably skips the header line that
  #   wmic prints before the values -- confirm on a Windows host.
  ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name <- system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id <- system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov
  
  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if(Sys.info()['sysname'] == 'Linux') {
  # Sys.info() reports 'Linux'; the previous comparison against 'Linuxs' could
  #   never be TRUE, so Linux runs always fell through to the message below.
  # Lists the processes named 'rsession' and splits each output line on single
  #   spaces; the first line (the ps header) is dropped.
  splitted <- strsplit(system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE), " ")
  df <- do.call(rbind, lapply(splitted[-1], 
                              function(x) data.frame(
                                cpu = as.numeric(x[2]),
                                mem = as.numeric(x[4]),
                                pid = as.numeric(x[5]),
                                cmd = paste(x[-c(1:5)], collapse = " "))))
  # Print explicitly (as the Windows branch does) so the table also reaches the
  #   log when the script is run through source().
  print(df)
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

sessionInfo()


# Load the prepped hearings and child-gender tables (first column dropped).
hearings <- read_csv('./data/prepped/hearings/cleaned_docs.csv',col_select = -1)
finalChildren <- read_csv('./data/prepped/demographics/child_gender_final.csv',col_select = -1)

# One row per distinct legislator / district / chamber / year seen in the
#   hearings. `lname` is the speaker label with the honorific and periods
#   removed, lower-cased. Rows without a state abbreviation are dropped.
toMerge <- hearings %>%
  select(docID,speaker,year,party,distIDcurr,stab,name,opensecretsID,chamber) %>%
  distinct() %>%
  mutate(speaker = ifelse(opensecretsID == 'N00003736','Chairman Oxley',speaker),
         lname = tolower(gsub('(Chairman|Mr|Mrs|Ms|Senator)(\\.)* |\\.','',speaker))) %>%
  select(name,lname,opensecretsID,stab,distIDcurr,chamber,year) %>%
  distinct() %>%
  filter(!is.na(stab))
toMerge

# distSimp: simplified district key (Senate 'S<digit>' collapsed to 'S', '00'
#   rewritten as '01' otherwise). clname: name with periods, generational
#   suffixes and single-letter middle initials removed.
toMerge <- mutate(
  toMerge,
  distSimp = ifelse(chamber == 'Senate',
                    gsub('S\\d','S',distIDcurr),
                    gsub('00','01',distIDcurr)),
  clname = gsub(' [A-Z] ',' ',
                gsub('\\.| II$| III$| IV$| V$| VI$| JR\\.$','',name))
)

# Build the per-record kids table from the child-gender data:
#   dist       - granId with the 'CDIR-YYYY-MM-DD-' prefix removed, then reduced
#                to the part after the '-<LETTER>-' chamber marker
#   chamber    - 'House' when the remainder contains '-H-', otherwise 'Senate'
#   clname     - name without periods / generational suffixes
#   lname      - lower-cased last token of clname
#   distIDcurr - state abbreviation + two-digit district number
#   year       - first four-digit run found in granId
kids <- finalChildren %>%
  mutate(dist = gsub('CDIR-\\d{4}-\\d{2}-\\d{2}-','',granId)) %>%
  mutate(chamber = ifelse(grepl('-H-',dist),'House','Senate')) %>%
  mutate(clname = gsub('\\.| II$| III$| IV$| V$| VI$| JR$','',name)) %>% 
  mutate(lname = tolower(gsub('^.* ([A-Z])','\\1',clname))) %>% 
  mutate(dist = gsub('.*?-[A-Z]-','',dist)) %>%
  # chamber is coded 'House'/'Senate' two steps above, so the DE/VT override
  #   has to test for 'House'; the previous test against 'H' never matched and
  #   the override was silently skipped.
  mutate(distIDcurr = paste0(stab,sprintf('%02d',as.numeric(ifelse(stab %in% c('DE','VT') & chamber == 'House','1',dist)))),
         year = as.numeric(str_extract(granId,'\\d{4}')))

# Seems to be a few remaining issues with the kids here. One final manual check required.
# For every row of toMerge (Guam is skipped), pick the kids record from the
#   same state whose cleaned name is closest in string distance, restricted to
#   records dated no later than the following year (or undated). Among equally
#   close names the record minimising (hearing year - record year) is kept;
#   if that yields nothing, the undated records are used instead.
# Matches are collected in a pre-sized list and bound once at the end instead
#   of growing a data frame inside the loop.
matchedKids <- vector('list', nrow(toMerge))
for(i in seq_len(nrow(toMerge))) {
  if(toMerge$stab[i] == 'GU') { next }
  
  ref <- kids %>%
    filter(stab == toMerge$stab[i],
           year <= (toMerge$year[i] + 1) | is.na(year))
  
  dists <- stringdist(toMerge$clname[i],
                      ref$clname)
  ref <- ref %>%
    slice(which(dists == min(dists)))
  
  ref2 <- ref %>%
    slice(which.min(toMerge$year[i] - year)) %>% data.frame()
  
  if(nrow(ref2) == 0) {
    ref2 <- ref %>%
      filter(is.na(year)) %>% distinct()
  }
  # Exactly one candidate must remain; say which row failed so it can be
  #   inspected instead of stopping with an empty error message.
  if(nrow(ref2) != 1) {
    stop('Expected exactly one kids match for "',toMerge$clname[i],'" (',
         toMerge$stab[i],', ',toMerge$year[i],', toMerge row ',i,') but found ',
         nrow(ref2),call. = FALSE)
  }
  matchedKids[[i]] <- ref2 %>%
    select(nKids,nDaughters,nSons,firstDaughter,distIDcurr,year,clname) %>%
    mutate(matchName = toMerge$clname[i],
           opensecretsID = toMerge$opensecretsID[i],
           strdist = stringdist(ref2$clname,toMerge$clname[i]),
           matchYear = toMerge$year[i],
           matchDist = toMerge$distIDcurr[i])
}
# Skipped rows are left as NULL entries, which bind_rows() ignores.
preppedKids <- bind_rows(matchedKids)


write.csv(preppedKids,file = './data/prepped/demographics/kids_to_check_12_25_2021.csv')


# Looking up the manually-checked mistakes here
# (Inspection only: results are printed, nothing is assigned.)
# Raw child-gender records whose name contains 'MURPHY', ordered by name and granId.
finalChildren %>%
  filter(grepl('MURPHY',name)) %>% 
  select(nKids,nDaughters,nSons,firstDaughter,name,granId) %>% 
  arrange(name,granId) %>% data.frame()
# Same kind of check on the prepared kids table for names containing 'McADAMS'.
kids %>%
  filter(grepl('McADAMS',name)) %>% 
  select(nKids,nDaughters,nSons,firstDaughter,name,year,distIDcurr) %>% 
  arrange(name,year) %>% data.frame()

# Row positions in toMerge whose name contains 'MURPHY'.
which(grepl('MURPHY',toMerge$name))

# NOTE(review): hard-coded row position; it only points at the intended record
#   while the row order of toMerge is unchanged -- confirm before relying on it.
toMerge[377,]

# The kids-to-check file written by this script was corrected by hand (JB,
#   12/25/2021) directly in the .csv; the corrected copy is read back in here.
cleanedKids <- read_csv('./data/prepped/demographics/kids_to_check_12_25_2021_jb_manual.csv')

# Attach the child counts to the hearings (join on the shared columns of the
#   two tables; matchYear is exposed as year for that purpose).
merged <- left_join(
  hearings,
  select(cleanedKids,nKids,nDaughters,nSons,firstDaughter,opensecretsID,year = matchYear)
)

# Remaining rows with an 'N00...' opensecretsID but no child data. Per the
#   manual check only the delegate from Guam is missing; he appears to have two
#   kids, one elder boy and one younger girl
#   (source: https://www.google.com/search?q=michael+san+nicolas+family).
filter(merged,
       is.na(nKids),
       grepl('^N00',opensecretsID))

# Fill in the Guam delegate (N00042619) by hand: 2 kids, 1 daughter, 1 son,
#   first child not a daughter.
merged <- mutate(
  merged,
  nKids = ifelse(opensecretsID == 'N00042619',2,nKids),
  nDaughters = ifelse(opensecretsID == 'N00042619',1,nDaughters),
  nSons = ifelse(opensecretsID == 'N00042619',1,nSons),
  firstDaughter = ifelse(opensecretsID == 'N00042619',0,firstDaughter)
)


# Politician demographics, joined onto the hearings-with-kids table.
demogs <- read_csv('./data/prepped/politicians/politician_prepped_new.csv',col_select = -1)

# The Fed chairs (opensecretsID starting 'FED...') get hand-coded values:
#   age       = hearing year minus a fixed year per chair
#   seniority = hearing year minus a fixed year per chair
#   NOMINATE dimensions set to 0 for every ID containing 'FED'
# All other rows keep the joined values.
finalMerge <- left_join(merged,demogs) %>%
  mutate(
    age = ifelse(opensecretsID == 'FEDGREENSPAN',year - 1926,
          ifelse(opensecretsID == 'FEDYELLEN',year - 1946,
          ifelse(opensecretsID %in% c('FEDBERNANKE','FEDPOWELL'),year - 1953,
                 age)))
  ) %>%
  mutate(
    seniority = ifelse(opensecretsID == 'FEDGREENSPAN',year - 1989,
                ifelse(opensecretsID == 'FEDYELLEN',year - 2014,
                ifelse(opensecretsID %in% c('FEDBERNANKE'),year - 2006,
                ifelse(opensecretsID %in% c('FEDPOWELL'),year - 2018,
                       seniority))))
  ) %>%
  mutate(
    nominate_dim1 = ifelse(grepl('FED',opensecretsID),0,nominate_dim1),
    nominate_dim2 = ifelse(grepl('FED',opensecretsID),0,nominate_dim2)
  )


save(finalMerge,file = './data/prepped/finalData_12_25_2021.RData')


# Interpolating missing years
# Build a (person x year) grid covering every year between the first and last
#   year in kids, join the observed records onto it, and fill each person's
#   attributes up and down across the years without a record. Afterwards a few
#   districts / chambers are corrected by hand and clname is harmonised with
#   the spelling used on the hearings side.
kidsToMerge <- expand.grid(uniqID = unique(paste0(kids$name,kids$dist,kids$stab,kids$chamber)),
                    year = min(kids$year,na.rm=TRUE):max(kids$year,na.rm=TRUE)) %>%
  left_join(kids %>% mutate(uniqID = paste0(name,dist,stab,chamber)) %>% select(-granId) %>% distinct()) %>%
  group_by(uniqID) %>%
  arrange(year) %>%
  # fill() with all_of() replaces the deprecated fill_().
  fill(all_of(c('lname','dist','nKids','nDaughters','nSons','firstDaughter','type','clname','stab','chamber','name','distIDcurr')),.direction = 'updown') %>%
  ungroup() %>%
  select(-uniqID) %>%
  # The narrower 'year < 2007' test must come before 'year < 2013'; in the
  #   previous order the 2013 test caught every earlier year as well, so the
  #   'CT16' branch could never be reached.
  mutate(distIDcurr = ifelse(name == 'CHRISTOPHER S. MURPHY' & year < 2007,'CT16',
                       ifelse(name == 'CHRISTOPHER S. MURPHY' & year < 2013,'CT05',
                              ifelse(name == 'WM. LACY CLAY','MO07',
                                     ifelse(name == 'LANCE GOODEN' & year < 2020,'TX05',distIDcurr)))),
         chamber = ifelse(name == 'CHRISTOPHER S. MURPHY' & year %in% c(1997:2012),'House',chamber)) %>%
  mutate(distIDcurr = ifelse(chamber == 'Senate',gsub('0','S',distIDcurr),distIDcurr)) %>%
  # distSimp mirrors the key built for toMerge. The clname chain maps specific
  #   spellings to the hearings-side spelling; everything else is upper-cased
  #   with single-letter middle initials and a trailing ' JR' removed.
  mutate(distSimp = ifelse(chamber == 'Senate',gsub('S\\d','S',distIDcurr),gsub('00','01',distIDcurr)),
         clname = ifelse(clname == 'FRANK R MASCARA','FRANK MASCARA',
                  ifelse(clname == 'BRAD J SHERMAN','BRAD SHERMAN',
                  ifelse(clname == 'ROBERT RILEY' & stab == 'AL','BOB RILEY',
                  ifelse(clname == 'MELQUIADES (MEL) R MARTINEZ','MEL MARTINEZ',
                  ifelse(clname == 'CAROLYN McCARTHY','CAROLYN MCCARTHY',
                  ifelse(clname == 'JOHN H ADLER','JOHN ADLER',
                  ifelse(clname == 'ADAM H PUTNAM','ADAM PUTNAM',
                  ifelse(clname == 'DANIEL B MAFFEI','DAN MAFFEI',
                  ifelse(clname == 'CHRISTOPHER JOHN LEE','CHRISTOPHER LEE',
                  ifelse(clname == 'PAT TOOMEY','PATRICK TOOMEY',
                  ifelse(clname == 'NAN AS HAYWORTH','NAN S HAYWORTH',
                  ifelse(clname == 'JIM RENACCI','JAMES RENACCI',
                  ifelse(clname == 'STEPHEN FINCHER','STEPHEN LEE FINCHER',
                  ifelse(clname == 'JOSEPH S DONNELLY','JOE DONNELLY',
                  ifelse(clname == 'J FRENCH HILL','FRENCH HILL',
                  ifelse(clname == 'WILLIAM LACY CLAY','WM LACY CLAY',
                  ifelse(grepl('JESUS.*GARCIA',clname),'JESUS GARCIA',
                         gsub(' JR$','',gsub(' [A-Z] ',' ',toupper(clname)))
                  )))))))))))))))))) %>%
  select(-distIDcurr,-name)

# Diagnostic: (clname, distSimp, year) combinations that still occur more than
#   once in kidsToMerge once dist is ignored.
kidsToMerge %>%
  select(-dist) %>%
  distinct() %>%
  group_by(clname,distSimp,year) %>%
  mutate(n = n()) %>%
  filter(n > 1) %>%
  arrange(year,clname)


# Harmonise two hearings-side spellings, drop lname, then attach the kids
#   information on the columns the two tables share. The result is printed.
merged <- toMerge %>%
  mutate(clname = ifelse(clname == "JESUS ``CHUY'' GARCIA",'JESUS GARCIA',clname)) %>%
  mutate(clname = ifelse(grepl('^J.*HIMES$',clname),'JAMES HIMES',clname)) %>%
  select(-lname) %>%
  left_join(kidsToMerge)
merged


# Diagnostic: legislator-years that ended up with more than one row.
merged %>%
  group_by(opensecretsID,year) %>%
  mutate(n = n()) %>%
  filter(n > 1) %>%
  data.frame()

write.csv(merged,file = './data/prepped/demographics/child_gender_final_merged.csv')

# Preparing the legislator file
# Most frequent value of x. On ties the value that appears first in x wins;
#   NA is counted like any other value.
Mode <- function(x) {
  distinctVals <- unique(x)
  counts <- tabulate(match(x, distinctVals))
  distinctVals[which.max(counts)]
}

# Transliterate x to ASCII using stringi's "Latin-ASCII" transform.
#   x: character vector (e.g. 'Acevedo-Vilá'); the transformed vector is returned.
textClnr <- function(x) {
  stringi::stri_trans_general(x, id = "Latin-ASCII")
}

# Historical legislators (raw file); the pipe below only prints the rows whose
#   first name contains 'Linda' for inspection.
legsHist <- read_csv('./data/raw/politicians/legislators-historical.csv')
legsHist %>%
  filter(grepl('Linda',first_name))

# Current legislators (raw file); same inspection, with last_name shown after
#   Latin-ASCII transliteration. Nothing is assigned here.
legsCurr <- read_csv('./data/raw/politicians/legislators-current.csv')
legsCurr %>%
  filter(grepl('Linda',first_name)) %>%
  mutate(last_name = stringi::stri_trans_general(str = last_name, id = "Latin-ASCII"))

# Keep legislators born after 1910-01-01 (rows with a missing birthday are
#   dropped by the filter as well) and build two name variants:
#   Name = first + last, fullNick = nickname + last. When nickname is NA,
#   paste0() yields the literal prefix 'NA ', which is repaired further below.
legsHist <- legsHist %>%
  filter(birthday > as.Date('1910-01-01')) %>%
  mutate(Name = paste0(first_name,' ',last_name),
         fullNick = paste0(nickname,' ',last_name)) %>%
  select(Name,full_name,last_name,first_name,gender,type,stab = state,
         dist = district,fullNick,opensecretsID = opensecrets_id,bioguide_id)

# Identical preparation for the current legislators.
legsCurr <- legsCurr %>%
  filter(birthday > as.Date('1910-01-01')) %>%
  mutate(Name = paste0(first_name,' ',last_name),
         fullNick = paste0(nickname,' ',last_name)) %>%
  select(Name,full_name,last_name,first_name,gender,type,stab = state,
         dist = district,fullNick,opensecretsID = opensecrets_id,bioguide_id)

# Stack both files and transliterate every name column to ASCII. The printed
#   line lists last names that still contain characters other than letters,
#   digits or spaces. lname is the trimmed, lower-cased ASCII last name.
legsFull <- bind_rows(legsCurr,legsHist)
test <- textClnr(legsFull$last_name)
test[which(grepl('[^[:alnum:] ]',test))]
legsFull$lname <- trimws(tolower(test))
legsFull$Name <- textClnr(legsFull$Name)
legsFull$full_name <- textClnr(legsFull$full_name)
legsFull$fullNick <- textClnr(legsFull$fullNick)

# Where fullNick starts with the literal 'NA' (no nickname was available),
#   replace that prefix with the first name after swapping common given names
#   for their usual short forms. The swaps are applied in the listed order.
legsFull$fullNick <- vapply(seq_len(nrow(legsFull)), function(i) {
  nickSwaps <- c(Susan = 'Sue',
                 Daniel = 'Dan',
                 Michael = 'Mike',
                 Timothy = 'Tim',
                 Christopher = 'Chris',
                 Benjamin = 'Ben',
                 William = 'Bill',
                 Robert = 'Bob',
                 Marcia = 'Marcy',
                 James = 'Jim',
                 Joseph = 'Joe',
                 Gerald = 'Gerry',
                 Sander = 'Sandy')
  nick <- legsFull$first_name[i]
  for(given in names(nickSwaps)) {
    nick <- gsub(given,nickSwaps[[given]],nick)
  }
  gsub('^NA',nick,legsFull$fullNick[i])
}, character(1))

# Two nicknames corrected by hand.
legsFull <- mutate(
  legsFull,
  fullNick = ifelse(fullNick == 'Sander Levin','Sandy Levin',
                    ifelse(fullNick == 'C. Young','Bill Young',fullNick))
)

# Legislators without an opensecretsID are set aside in toCheck (an ID is
#   looked up for them further below); legsFull keeps only rows with an ID.
toCheck <- filter(legsFull,is.na(opensecretsID))

legsFull <- filter(legsFull,!is.na(opensecretsID))


# Interest group merge
# Read the interest-group ratings (first column dropped). utf8_encode() turns
#   non-ASCII bytes into visible escape sequences; the four escapes
#   \xe9, \xe1, \xf3, \xfa are then replaced by plain e, a, o, u.
IG <- read_csv('./data/prepped/demographics/IG_ratings_new.csv',col_select = -1) %>%
  mutate(Name = gsub('\\\\xfa','u',gsub('\\\\xf3','o',gsub('\\\\xe1','a',gsub('\\\\xe9','e',utf8::utf8_encode(Name))))))
  

# Inspection: names (with their row index) that still contain a backslash,
#   i.e. an escape that was not handled above.
IG %>%
  mutate(Name = utf8::utf8_encode(Name)) %>%
  mutate(inds = row_number()) %>%
  filter(grepl('\\\\',Name)) %>%
  group_by(Name) %>%
  # slice(1) %>%
  select(Name,inds)

# Inspection: legislator rows whose fullNick contains 'Labrador'.
legsFull %>%
  filter(grepl('Labrador',fullNick))

# NOTE(review): hard-coded row position used for a spot check; only meaningful
#   while the row order of the ratings file is unchanged.
IG %>%
  slice(8567)

# Derive matching keys from the ratings:
#   dist    - text after ' - ' in Office, 'At-Large' -> '1', 'Sr'/'Jr' -> 'SEN'
#   chamber - 'H' / 'S' when Office mentions House / Senate, otherwise NA
#   clname  - Name without periods, commas and generational suffixes
#   lname   - lower-cased last token of clname
#   rating  - Rating with '%' removed, as a number
#   uniqID  - lname + stab + dist + chamber
IGcln <- IG %>%
  mutate(dist = gsub('Sr|Jr','SEN',gsub('At-Large','1',gsub('.* - (.*)','\\1',Office))),
         chamber = ifelse(grepl('House',Office),'H',
                          ifelse(grepl('Senate',Office),'S',NA)),
         clname = gsub('\\.|,| II$| III$| IV$| V$| VI$| Jr| Sr','',Name)) %>% 
  mutate(lname = tolower(gsub('^.* ([A-Z])','\\1',clname)),
         rating = as.numeric(gsub('%','',Rating))) %>%
  select(lname,stab = State,dist,chamber,year,IG,rating,Name,Office) %>%
  mutate(uniqID = paste0(lname,stab,dist,chamber))


# Matching with opensecretsIDs
# Ratings that find a legislator when joined on the columns the two tables
#   share. The legislator's state / district / chamber are renamed to *Leg so
#   they are kept next to the rating's own values rather than used as keys.
IGclnMatched <- IGcln %>%
  left_join(legsFull %>% rename(stabLeg = stab,distLeg = dist,chamberLeg = type)) %>% filter(!is.na(opensecretsID))

# Ratings with no legislator found by the same kind of join; the (empty)
#   legislator columns listed in the select are dropped again.
unmatched <- IGcln %>%
  left_join(legsFull %>% select(-stab,-dist)) %>% filter(is.na(opensecretsID)) %>%
  select(-full_name,-gender,-type,-fullNick,-opensecretsID)



# Preparing the unmatched file
# One row per distinct unmatched person. The printed line shows rows whose
#   transliterated last name still contains characters other than letters,
#   digits or spaces; lname and Name are then replaced by their ASCII forms.
unmatchedSimp <- unmatched %>% select(Name,stab,dist,chamber,lname) %>% distinct()
test2 <- textClnr(unmatchedSimp$lname)
unmatchedSimp[which(grepl('[^[:alnum:] ]',test2)),]
unmatchedSimp$lname <- trimws(tolower(test2))
unmatchedSimp$Name <- textClnr(unmatchedSimp$Name)
# install.packages("stringdist")



# Candidate files from the donations data, one per even-numbered cycle
#   2000-2020 (./data/raw/donations/cands00.txt ... cands20.txt), stacked into
#   a single tibble. Fields are comma separated and quoted with '|'.
# header = FALSE: column names are supplied through col.names. With
#   read.delim()'s default header = TRUE the first line of every file was
#   consumed as a header and that record was lost.
# NOTE(review): this assumes the cands files carry no header row -- confirm
#   against the raw files.
cands <- bind_rows(lapply(seq(2000,2020,by = 2), function(cyc) {
  as_tibble(read.delim(paste0('./data/raw/donations/cands',substr(as.character(cyc),3,4),'.txt'),
                       header = FALSE,
                       col.names = c('cycle','fecID','opensecretsID','FirstLastP','party','distIDrunfor','distIDcurr',
                                     'currCand','CycleCand','CRPICO','RecipCode','NoPacs'),sep = ',',quote = '|'))
}))

# stab: first two characters of distIDcurr. name: FirstLastP with the trailing
#   ' (party)' part removed, upper-cased.
cands <- cands %>%
  mutate(stab = substr(distIDcurr,1,2),
         name = toupper(gsub(' \\(.*','',FirstLastP)))

# lname: last token of the name after removing periods and a trailing
#   JR / SR / roman-numeral suffix. vapply() guarantees a character vector.
cands$lname <- vapply(str_split(gsub(' (JR|SR|I(I+|V)*)$','',
                                     gsub('\\.','',cands$name)),' '),
                      function(x) tail(x,1),
                      character(1))

# For every legislator without an opensecretsID (toCheck), look for the
#   candidate record(s) from the same chamber with the same last name (and
#   the same state when it is known) whose full name is closest in string
#   distance. All records tied at the minimum distance are kept.
# The two chamber subsets do not depend on the loop index, so they are built
#   once up front; matches are collected in a list and bound once at the end.
senCands <- cands %>%
  filter(substr(distIDcurr,3,3) == 'S') %>%
  select(opensecretsID,FirstLastP,party,distIDcurr,stab,name,lname) %>%
  distinct()
houseCands <- cands %>%
  filter(grepl('\\d',substr(distIDcurr,3,3) )) %>%
  select(opensecretsID,FirstLastP,party,distIDcurr,stab,name,lname) %>%
  distinct()

lookupMatches <- vector('list', nrow(toCheck))
for(i in seq_len(nrow(toCheck))) {
  chamb <- toCheck$type[i]
  state <- toCheck$stab[i]
  ref <- if(chamb == 'sen') senCands else houseCands
  ref <- ref %>%
    filter(lname == toupper(toCheck$last_name[i]))
  if(nrow(ref) == 0) { next }
  if(!is.na(state)) {
    ref <- ref %>%
      filter(stab == state)
  }
  if(nrow(ref) == 0) { next }
  dists <- stringdist(toupper(toCheck$Name[i]),ref$name)
  ref <- ref[which(dists == min(dists)),]
  ref$toCheckName <- toCheck$Name[i]
  ref$dist <- dists[which.min(dists)]
  lookupMatches[[i]] <- ref
  cat(i,'\n')
}
# Legislators without a candidate match stay NULL and are ignored by bind_rows().
lookup <- bind_rows(lookupMatches)

# Inspection: looked-up candidates whose name is not an exact match
#   (string distance > 0), largest distance first.
lookup %>%
  select(toCheckName,name,dist,opensecretsID) %>%
  data.frame() %>%
  filter(dist > 0) %>%
  arrange(-dist)

# opensecretsIDs removed from the lookup result.
# NOTE(review): presumably these are the false matches identified from the
#   listing above -- the selection criterion is not recorded in this script.
drops <- c('N00030198',
           'N00030267',
           'N00002299',
           'N00000308',
           'N00002383',
           'N00001148',
           'N00001261',
           'N00003535',
           'N00001276',
           'N00000423',
           'N00007390',
           'N00035504',
           'N00027623',
           'N00005090',
           'N00036107',
           'N00024919',
           'N00000684',
           'N00003762',
           'N00031390',
           'N00004884',
           'N00012233',
           'N00002793',
           'N00042240',
           'N00027848',
           'N00038601',
           'N00009954',
           'N00007879',
           'N00007232',
           'N00006983',
           'N00030682',
           'N00004698',
           'N00005645',
           'N00024992',
           'N00007068',
           'N00007833')

lookup <- lookup %>%
  filter(!opensecretsID %in% drops)

# Append the previously ID-less legislators to legsFull, now carrying the
#   opensecretsID found through the lookup (NA when no match was kept).
legsFull <- bind_rows(legsFull,
                      toCheck %>%
                        select(-opensecretsID) %>%
                        left_join(lookup %>% select(Name = toCheckName,
                                                    opensecretsID)))

# For every unmatched interest-group name, find the closest legislator:
#   1. candidate legislators share the last name (and the state when known);
#      if none, fall back to legislators whose last name contains it;
#   2. among the candidates, string distance is computed against three name
#      variants (Name, full_name, fullNick) and the overall minimum wins.
# Rows with no candidate at all are recorded by index in `missing`.
unmatchedFixed <- missing <- NULL
for(i in 1:nrow(unmatchedSimp)) {
  # i = 133
  # Sys.sleep(1)
  minDists <- mins <- NULL
  # unmatchedSimp[i,]
  cat(i,': ',unmatchedSimp$Name[i],': ')
  # NOTE(review): the patterns below look like mis-encoded accented characters
  #   being mapped to 'e' / 'a'; left untouched because the intended bytes
  #   cannot be recovered from this file.
  lnm <- gsub('<U+653C><U+3E39>|?\u0094?㸹','e',gsub('?\u0094?㸱','a',iconv(unmatchedSimp$lname[i], "latin1", "UTF-8",sub='')))
  
  if(!is.na(unmatchedSimp$stab[i])) {
    toCheckIn <- legsFull %>%
      filter(tolower(lname) == lnm & stab == unmatchedSimp$stab[i])
  } else {
    toCheckIn <- legsFull %>%
      filter(tolower(lname) == lnm)
  }
  # Fallback: lnm is used as a regular expression against all last names.
  if(nrow(toCheckIn) == 0) { 
    toCheckIn <- legsFull %>%
      filter(grepl(lnm,tolower(lname)))
  } 
  if(nrow(toCheckIn) == 0) {
    missing <- c(missing,i)
    next()
  }
  nm <- iconv(unmatchedSimp$Name[i], "latin1", "UTF-8",sub='')
  
  # mins / minDists hold, per name variant, the position of the closest
  #   candidate and its distance.
  dists <- stringdist(nm,toCheckIn$Name)
  mins <- c(mins,which.min(dists))
  minDists <- c(minDists,dists[which.min(dists)])
  dists <- stringdist(nm,toCheckIn$full_name)
  mins <- c(mins,which.min(dists))
  minDists <- c(minDists,dists[which.min(dists)])
  dists <- stringdist(nm,toCheckIn$fullNick)
  mins <- c(mins,which.min(dists))
  minDists <- c(minDists,dists[which.min(dists)])
  
  # if(i == 35) { stop() }
  # Candidate position belonging to the smallest of the three distances.
  mind <- mins[which.min(minDists)]
  # Combine the unmatched row with the chosen legislator; the legislator's
  #   columns are renamed to *Leg so they do not clash with the rating's own.
  tmp <- cbind(unmatchedSimp[i,],toCheckIn[mind,] %>% rename(NameLeg = Name,
                                                             lnameLeg = lname,
                                                        stabLeg = stab,
                                                        distLeg = dist,
                                                        chamberLeg = type) %>% mutate(matchDist = minDists[which.min(minDists)],
                                                                                                   matchName = NameLeg))
  unmatchedFixed <- bind_rows(unmatchedFixed,tmp)
  cat(toCheckIn$Name[mind],":",minDists[which.min(minDists)],'\n')
}


# Inspection: inexact matches (distance > 0), worst first.
as_tibble(unmatchedFixed) %>%
  select(Name,matchName,matchDist) %>%
  arrange(-matchDist) %>%
  filter(matchDist > 0) %>%
  data.frame()

# Spot checks on individual names.
unmatchedSimp %>%
  filter(grepl('Christopher Cox',Name))

legsFull %>%
  filter(grepl('John Cox',Name))

# Export the distinct inexact matches; a hand-checked copy of this file
#   (IG_toclean_JBchecked.csv) is read back in below.
write.csv(as_tibble(unmatchedFixed) %>%
            select(Name,matchName,matchDist,opensecretsID) %>%
            filter(matchDist > 0) %>% distinct(),file = './data/prepped/demographics/IG_toclean.csv')


# Manual checking
# Inspection: all legislators whose Name contains 'Williams'.
legsFull %>%
  filter(grepl('Williams',Name)) %>% data.frame()


# Hand-checked version of IG_toclean.csv, re-joined to the legislator table
#   (on the shared columns) to pick up gender, chamber type, state and district.
cleaned <- read_csv('./data/prepped/demographics/IG_toclean_JBchecked.csv') %>%
  select(Name,matchName,opensecretsID) %>%
  left_join(distinct(select(legsFull,matchName = Name,opensecretsID,gender,type,stab,dist)))


# Inspection: inexact matches that find no legislator details in the
#   hand-checked file.
as_tibble(unmatchedFixed) %>%
  filter(matchDist > 0) %>%
  select(-opensecretsID,-gender,-matches('Leg$')) %>%
  left_join(rename(select(cleaned,-matchName),
                   chamberLeg = type,stabLeg = stab,distLeg = dist)) %>%
  filter(is.na(chamberLeg))

# Exact matches are kept as they are; inexact matches have their automatically
#   assigned legislator columns replaced by the hand-checked ones.
fixed <- bind_rows(
  as_tibble(unmatchedFixed) %>%
    filter(matchDist == 0),
  as_tibble(unmatchedFixed) %>%
    filter(matchDist > 0) %>%
    select(-opensecretsID,-gender,-matches('Leg$')) %>%
    left_join(rename(select(cleaned,-matchName),
                     chamberLeg = type,stabLeg = stab,distLeg = dist))
)

# Bring the ratings of the previously unmatched names back in (Name is
#   transliterated to ASCII first so it lines up with the names in fixed).
fixed <- unmatched %>%
  select(stab,dist,chamber,year,IG,rating,Name) %>%
  mutate(Name = textClnr(Name)) %>%
  left_join(fixed)


# All ratings with an opensecretsID: the directly matched ones plus the ones
#   recovered through fuzzy / manual matching.
IGfinal <- IGclnMatched %>%
  bind_rows(fixed) %>%
  select(opensecretsID,year,IG,rating,Name,stab,dist,chamber,stabLeg,distLeg,chamberLeg)# %>% distinct()

# Harmonise the coding of the rating-side and legislator-side keys:
#   senators get distLeg 'SEN', district '0' becomes '1', 'Delegate' seats of
#   DC/AS/MP/VI/GU become district '1', and 'rep'/'sen' become 'H'/'S'.
IGfinal <- IGfinal %>%
  mutate(distLeg = ifelse(chamberLeg == 'sen','SEN',
                          ifelse(distLeg == '0','1',distLeg)),
         dist = ifelse(dist == 'Delegate' & stab %in% c('DC','AS','MP','VI','GU'),'1',dist),
         chamberLeg = ifelse(chamberLeg == 'rep','H',
                             ifelse(chamberLeg == 'sen','S',chamberLeg))) 
# Inspection: rows where the rating's state differs from the legislator's.
IGfinal %>%
  filter(stab != stabLeg) %>%
  group_by_at(vars(opensecretsID,Name,matches('stab|dist|chamber'))) %>%
  summarise(n=n()) %>% data.frame()

# Expand to one row per legislator and year (1995-2021) and fill each
#   legislator's values up and down across the years without a rating.
# fill() with all_of() replaces the deprecated fill_(); the result is
#   ungrouped so the grouping does not leak into later steps.
IGcln <- expand.grid(year = 1995:2021,
                     opensecretsID = unique(IGfinal$opensecretsID)) %>%
  left_join(IGfinal) %>%
  group_by(opensecretsID) %>%
  arrange(year) %>%
  fill(all_of(c('IG','rating','Name','stab','dist','chamber','stabLeg','distLeg','chamberLeg')),.direction = 'updown') %>%
  ungroup()


# Final dat to use: the hearings roster with the interest-group ratings
#   attached by legislator and year.
finalIG <- left_join(toMerge,
                     select(IGcln,year,opensecretsID,IG,rating,Name),
                     by = c('opensecretsID','year'))

# 18 people in the hearings are not rated by interest groups
finalIG %>%
  filter(is.na(rating)) %>%
  select(lname,opensecretsID) %>%
  distinct() %>%
  data.frame()

write.csv(finalIG,file = './data/prepped/demographics/finalIG.csv')




# Final children cleaning, part two: start again from the merged child-gender
#   file written earlier by this script. Its first column ('...1') holds the
#   row names saved by write.csv() and serves as the row identifier below.
children <- read_csv('./data/prepped/demographics/child_gender_final_merged.csv')

# Inspection: legislators that have more than one row in some year.
children %>%
  group_by(opensecretsID,year) %>%
  summarise(n = n()) %>%
  arrange(-n) %>%
  filter(n > 1) %>%
  select(opensecretsID) %>%
  distinct() %>%
  data.frame()

# Inspection: all rows of one legislator.
children %>%
  filter(opensecretsID == 'N00037161') %>%
  data.frame()

# Rows to discard: specific (legislator, matched record) pairs.
drops <- filter(
  children,
  (opensecretsID == 'N00003758' & clname == 'ANDRE CARSON') |
    (opensecretsID == 'N00002097' & clname == 'JOHN W WARNER') |
    (opensecretsID == 'N00012508' & clname == 'THOMAS R CARPER') |
    (opensecretsID == 'N00026914' & grepl('GWENDOLYNNE ',clname)) |
    (opensecretsID == 'N00005301' & type == 'manual') |
    (opensecretsID == 'N00006008' & clname == 'RUBEN HINOJOSA') |
    (opensecretsID == 'N00009954' & clname == 'JOHN H CHAFEE') |
    (opensecretsID == 'N00024871' & nKids == 0) |
    (opensecretsID == 'N00029513' & clname == 'JULY CARSON')
)



# Remove those rows, drop the identifier and every column whose name contains
#   'name' or 'type', then collapse to one row per legislator / state /
#   district / chamber / year by taking the maximum of each remaining column.
# NOTE(review): for a group in which a numeric column is entirely NA,
#   max(..., na.rm = TRUE) returns -Inf (with a warning) rather than NA --
#   confirm that downstream code handles this.
children <- children %>%
  filter(!(`...1` %in% drops[['...1']])) %>%
  select(-`...1`,-matches('name|type')) %>%
  group_by(opensecretsID,stab,distIDcurr,chamber,year) %>%
  summarise_all(max,na.rm = TRUE)

# Inspection: legislator-years that still have several rows with differing
#   child counts.
children %>%
  group_by(opensecretsID,year,distIDcurr) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  filter(n > 1) %>%
  arrange(opensecretsID,year) %>%
  group_by(opensecretsID,year) %>%
  summarise(varKids = var(nKids)) %>%
  ungroup() %>%
  filter(varKids > 0)

# Inspection: all rows of one legislator.
children %>%
  filter(opensecretsID == 'N00004118')


write.csv(children,file = './data/prepped/demographics/child_gender_final_merged_cleaned.csv')

# EOF